/*
 * Functions for adding vectors.
 *
 * \author Martin Albrecht <martinralbrecht@googlemail.com>
 *
 */

#ifndef M4RI_XOR_H
#define M4RI_XOR_H

/*******************************************************************
 *
 *                 M4RI:  Linear Algebra over GF(2)
 *
 *    Copyright (C) 2008-2013  Martin Albrecht <martinralbrecht@googlemail.com>
 *
 *  Distributed under the terms of the GNU General Public License (GPL)
 *  version 2 or higher.
 *
 *    This code is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *    General Public License for more details.
 *
 *  The full text of the GPL is available at:
 *
 *                  http://www.gnu.org/licenses/
 *
 ********************************************************************/

#include <stdint.h>

#include <m4ri/m4ri_config.h>

#if __M4RI_HAVE_SSE2
#include <emmintrin.h>
#endif

#include <m4ri/misc.h>

/**
 * \brief Add the word vector t1 to c in place.
 *
 * Computes c[i] ^= t1[i] for 0 <= i < wide_in (addition over GF(2) is XOR).
 *
 * When __M4RI_HAVE_SSE2 is set, the bulk of the vector is processed 128 bits
 * at a time; c and t1 are assumed to share the same alignment modulo 16
 * bytes. Any remaining words are handled by an unrolled scalar loop.
 *
 * \param c       Destination vector, modified in place.
 * \param t1      Source vector, only read.
 * \param wide_in Number of words to combine. If it is not positive, no word
 *                of c or t1 is accessed.
 */

static inline void
_mzd_combine(word* c, word const* t1, wi_t wide_in)
{
	wi_t wide = wide_in;

	/*
	 * Nothing to add. Without this guard the unrolled scalar loop below is
	 * entered through `case 0` with a zero trip count and XORs eight words
	 * past the end of an empty vector.
	 */
	if (wide <= 0) {
		__M4RI_DD_RAWROW(c, wide_in);
		return;
	}

#if __M4RI_HAVE_SSE2
	/* assuming c and t1 are aligned the same way */

	/*
	 * Handle one leading word so that the 128-bit loop starts aligned.
	 * NOTE(review): assumes __M4RI_ALIGNMENT(p, 16) yields the address of p
	 * modulo 16 -- confirm against its definition.
	 */
	if (__M4RI_ALIGNMENT(c, 16) == 8) {
		*c++ ^= *t1++;
		wide--;
	}

	__m128i* dst128 = (__m128i*)c;
	__m128i const* src128 = (__m128i const*)t1;

	/*
	 * End of the last complete 16-byte block of c. The round trip goes
	 * through uintptr_t (with a mask of the same width) so the address is
	 * not truncated on targets where unsigned long is narrower than a
	 * pointer.
	 */
	__m128i const* const end128 =
	    (__m128i const*)((uintptr_t)(c + wide) & ~(uintptr_t)0xF);
	__m128i xmm1;

	/* Two 128-bit blocks per iteration while at least two remain. */
	while (dst128 < end128 - 1) {
		xmm1 = _mm_xor_si128(*dst128, *src128++);
		*dst128++ = xmm1;
		xmm1 = _mm_xor_si128(*dst128, *src128++);
		*dst128++ = xmm1;
	}

	/* At most one complete 128-bit block is left. */
	if (dst128 < end128) {
		xmm1 = _mm_xor_si128(*dst128, *src128++);
		*dst128++ = xmm1;
	}

	c = (word*)dst128;
	t1 = (word const*)src128;

	/* Words that did not fill a complete 16-byte block. */
	wide = ((sizeof(word) * wide) % 16) / sizeof(word);

	if (!wide) {
		__M4RI_DD_RAWROW(c, wide_in);
		return;
	}
#endif // __M4RI_HAVE_SSE2

	/*
	 * Scalar remainder as an 8-way unrolled Duff's device: the switch jumps
	 * into the loop body so that the first pass handles wide % 8 words (or
	 * a full 8 when the remainder is 0), and every later pass handles 8.
	 * wide is strictly positive here, so n >= 1.
	 */
	wi_t n = (wide + 7) / 8;
	switch (wide % 8) {
		case 0:
			do {
				*c++ ^= *t1++;
				/* fallthrough */
				case 7:
					*c++ ^= *t1++;
				/* fallthrough */
				case 6:
					*c++ ^= *t1++;
				/* fallthrough */
				case 5:
					*c++ ^= *t1++;
				/* fallthrough */
				case 4:
					*c++ ^= *t1++;
				/* fallthrough */
				case 3:
					*c++ ^= *t1++;
				/* fallthrough */
				case 2:
					*c++ ^= *t1++;
				/* fallthrough */
				case 1:
					*c++ ^= *t1++;
			} while (--n > 0);
			break;
		default:
			break;
	}
	/* NOTE(review): c has been advanced past the processed words here, as in
	 * the original code -- confirm this is what __M4RI_DD_RAWROW expects. */
	__M4RI_DD_RAWROW(c, wide_in);
}

/*
 * Include "xor_template.h" once for every N from 2 to 8. N is defined
 * immediately before each inclusion and undefined right after it, so each
 * inclusion sees exactly one value and N does not leak to code that includes
 * this header.
 * NOTE(review): the template presumably generates an N-source variant of
 * _mzd_combine for each value -- confirm against xor_template.h.
 */
#define N 2
#include "xor_template.h"
#undef N

#define N 3
#include "xor_template.h"
#undef N

#define N 4
#include "xor_template.h"
#undef N

#define N 5
#include "xor_template.h"
#undef N

#define N 6
#include "xor_template.h"
#undef N

#define N 7
#include "xor_template.h"
#undef N

#define N 8
#include "xor_template.h"
#undef N

#endif // M4RI_XOR_H
